We are going to run a simple linear regression and a polynomial regression on a COVID-19 dataset for Chicago, following the discussion in the forum below. We will also research whether we can find similar datasets for North Carolina, and how state-level datasets differ from this Chicago city-level dataset in their structure and data aggregation levels.
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Chicago daily COVID-19 testing export from the working directory.
data = pd.read_csv("COVID-19_Daily_Testing.csv")
data.head()
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
data.info()
## Clean data
# Columns whose counts are cleaned below: thousands separators (e.g. "1,234")
# are stripped and the values converted to numbers.
# The commented-out entries are further columns that can be enabled the same
# way if they are needed for the analysis.
comma_formatted_columns = [
    'Cases',
    'Tests',
    # 'People Not-Positive - Total',
    # 'People Tested - Age 18-29',
    # 'People Tested - Age 30-39',
    # 'People Tested - Female',
    # 'People Tested - Male',
    # 'People Tested - Latinx',
    # 'People Tested - Unknown Race/Ethnicity',
    # 'People Not-Positive - Age 18-29',
    # 'People Not-Positive - Female',
    # 'People Not-Positive - Male',
    # 'People Not-Positive - Unknown Race/Ethnicity',
]

for column in comma_formatted_columns:
    # The .str accessor only exists on text columns; if pandas already parsed
    # the column as numeric there are no separators to strip.
    if not pd.api.types.is_numeric_dtype(data[column]):
        # regex=False: the comma is a literal character, not a pattern.
        data[column] = data[column].str.replace(',', '', regex=False)
    data[column] = pd.to_numeric(data[column])
# Pairwise scatter plots / histograms of every numeric column, as a quick
# visual check of which variables move together.
data_numeric = data.select_dtypes(include=['float64', 'int64'])
# sns.pairplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure(...) would only produce an extra, empty figure.
# Use pairplot's `height` / `aspect` arguments if the grid needs resizing.
sns.pairplot(data_numeric)
plt.show()
# --- Simple linear regression: Cases as a function of Tests ---------------
# scikit-learn expects 2-D arrays, hence the reshape to a single column.
X = data['Tests'].to_numpy().reshape(-1, 1)
y = data['Cases'].to_numpy().reshape(-1, 1)

# Fit on the full dataset and predict on the same points (in-sample fit).
reg = LinearRegression().fit(X, y)
predictions = reg.predict(X)

intercept = reg.intercept_[0]
slope = reg.coef_[0][0]
print("The linear model is: Y = {:.5} + {:.5}X".format(intercept, slope))

# Observed points in black, fitted line in blue.
plt.figure(figsize=(16, 8))
plt.scatter(X, y, c='black')
plt.plot(X, predictions, c='blue', linewidth=2)
plt.xlabel("Tests")
plt.ylabel("Cases")
plt.show()

# In-sample root-mean-squared error of the linear fit.
rmse = np.sqrt(mean_squared_error(y, predictions))
print('RMSE for Linear Regression=>', rmse)
# --- Polynomial regression (degree 4): Cases as a function of Tests -------
# Expand the single Tests column into [1, x, x^2, x^3, x^4]. The transformer
# is fitted once, on X only; it must not be re-fitted on its own output.
poly = PolynomialFeatures(degree=4)
X_poly = poly.fit_transform(X)

# An ordinary linear model on the expanded features is a polynomial fit.
lin2 = LinearRegression()
lin2.fit(X_poly, y)
pred = lin2.predict(X_poly)

# A line plot needs the points ordered along the x axis, otherwise the curve
# zig-zags. lexsort orders by its last key first: X, with ties broken by the
# predicted value.
order = np.lexsort((pred.ravel(), X.ravel()))
new_X = X.ravel()[order]
new_y = pred.ravel()[order]

# Observed points in black, fitted polynomial curve in blue.
plt.figure(figsize=(16, 8))
plt.scatter(
    X,
    y,
    c='black'
)
plt.plot(
    new_X, new_y,
    c='blue'
)
plt.xlabel("Tests")
plt.ylabel("Cases")
plt.show()

# In-sample RMSE of the polynomial fit; reuses the predictions computed above
# instead of transforming and predicting a second time.
print('RMSE for Polynomial Regression=>', np.sqrt(mean_squared_error(y, pred)))
Answer: Yes, we researched COVID-19 pandemic daily data and found related state-wise information. We were able to download a North Carolina COVID-19 daily dataset from the following link: https://covidtracking.com/data/download (North Carolina: /api/v1/states/nc/daily.csv).
The first few observations of the NC COVID-19 dataset obtained are shown below.
# Load the North Carolina daily COVID-19 export and preview its first rows.
# NOTE: this rebinds `data`, replacing the Chicago frame loaded earlier.
nc_csv_path = "NC_daily.csv"
data = pd.read_csv(nc_csv_path)
data.head(n=5)